********************************************************************************
**	PURPOSE: Uses the wide data set that has the survey variables included and creates a long by month data set
**							
**	INPUTS: Wide with survey.dta Index_Long.dta
**	
**	OUTPUTS: Long with survey.dta
**				
**	NOTES:
**
**	CREATED/MODIFED BY: Kayla Wilding, Hasan Ahamed
**
**	DATE CREATED: 8/22/2017
**
**	DATE LAST EDITED: 2/28/2023
********************************************************************************
*0. Open dataset
*1. Merge sample defining and survey indices into long data set
*2. Create interaction terms 
*3. Create outcome variables 
*4. Create indexes 
*5. Clean up and output long data with outcomes
*6. Reshape and merge outcome vars to wide data set  

/*******************************************************************************
1. Merge sample defining and survey indices into long data set
********************************************************************************/
	**Use long index
		use "$adta/Index_Long", clear 
		merge m:1 surveyid using "$adta/Sample and Baseline Vars.dta", keep(3) nogen 

/*******************************************************************************
2. Create interaction terms 
********************************************************************************/
	*Label these variables so I can create labels of the interaction terms
		label var  	ficoscore08 "FICO score"
		label var  	enc 		"CBL Group"
		ren open_sd18 op18
		label var 	op18 	"Opened a Loan"
		la var surveyid "Unique ID"
		label var index "Month in study"

	**Create outcome variable that is 1 = delinquent more than 30 days on their CBL 
		drop if index > 18
		gen cm_delinquent_rc = cm_delinquent // don't replace with 0 if they don't take up
	    replace cm_delinquent = . if op18 == 0
		replace cm_delinquent_rc = . if op18 == 0 //if they dont open a cbl within 18 months then info on delinquency of cbl should be missing (n/a)

		***DK 9/26 "change default @ 18 months throughout tables to be = 1 if default EVER during the 18 months, not just default in the final 12 months, for CBL." 

		bysort surveyid: egen cm30plus = max(cm_delinquent) // egen to be the maxof cm_delq at each month
		assert cm30plus == 1 if cm30plus[_n-1] == 1 & surveyid == surveyid[_n-1]
		assert cm30plus == 1 if cm30plus[_n+1] == 1 & surveyid == surveyid[_n+1]
		label var cm30plus "1 = Ever Delinquent on CBL"
		
		bysort surveyid: egen cm30plus_rc = max(cm_delinquent_rc) // egen to be the max of cm_delq at each month
		assert cm30plus_rc == 1 if cm30plus_rc[_n-1] == 1 & surveyid == surveyid[_n-1]
		assert cm30plus_rc == 1 if cm30plus_rc[_n+1] == 1 & surveyid == surveyid[_n+1]

		**Make sure the cm_reopen is 0 or 1 for someone in all months; need to look into where this variable was created and fix it; op18 is 1 for people that open one before and one after; so I want to limit the first open date to make sure its bigger than the survey date 
		drop cm_reopen 
		bysort surveyid : egen firstopendate = min(cm_opendate) if op18 == 1 
		bysort surveyid : egen secondopendate = max(cm_opendate) if op18 == 1
		*this will be reopened if they opened  one before and after as well
		gen cm_reopenr = (firstopendate != secondopendate) if (!mi(firstopendate) & !mi(secondopendate))
		bysort surveyid : egen cm_reopen = max(cm_reopenr)
		label var cm_reopen "Opened two cbls regardless of when in relation to survey date"

		*want to only mark those that open two during the time after surveyed ("ds" means during study)
		bysort surveyid : egen firstopendate_dsr = min(cm_opendate) if op18 == 1 & surveydate <= cm_opendate & !mi(cm_opendate) & !mi(surveydate)
		bysort surveyid : egen firstopendate_ds = min(firstopendate_ds)
			label var firstopendate_ds "Date of first CBL opened after survey date (in study period)"
		bysort surveyid : egen secondopendate_dsr = max(cm_opendate) if op18 == 1 & surveydate <= cm_opendate & !mi(cm_opendate) & !mi(surveydate)
		gen cm_reopen_dsr = (firstopendate_ds != secondopendate_ds) if (!mi(firstopendate_ds) & !mi(secondopendate_ds))
		bysort surveyid : egen cm_reopen_ds= max(cm_reopen_dsr)
			label var cm_reopen_ds "Opened two cbls in the 18 months after their survey date"

		**want the first open date that they opened a cbl in the 18 months after their survey date 	
		*note open_sd marks opened a loan at any time ; 

		**Make a variable that is the first date the person opened a cbl
		format cm_opendate %td
		format firstopendate_ds %td
		format secondopendate_ds %td
		format firstopendate %td 
		format secondopendate %td

		**Check that erveybody that is marked as having a cbl open during the study period has a corresponding first open date
		assert !mi(firstopendate) if op18 == 1
		gen openonspot = (surveydate == firstopendate_ds) if !mi(firstopendate) & !mi(surveydate)
		gen days_opened = firstopendate_ds - surveydate
		label var openonspot "Opened cbl on the same day as survey"
		label var days_opened "No. of days from survey date until they open their first cbl"

	* Create time dummy vars (aka "post") & interact with treatment var
		keep if inlist(index, 1, 6, 12, 18) 
		gen post = (index > 1)
		gen pt1 = (index == 6)
		gen pt2 = (index == 12)
		gen pt3 = (index == 18)
		
		*label
		la var post 	"Post"
		la var pt1 		"6 month endline"
		la var pt2 		"12 month endline"
		la var pt3 		"18 month endline"

		*standardize the baseline fico score variable 
		gen bficor = ficoscore08 if index == 1 
		bysort surveyid : egen bfico = max(bficor)
		label var bfico "Baseline FICO Score"
		
*Create all fico variables in hundreds
	foreach w in ficoscore08  {
		gen `w'h = `w'/100	
		local label1: variable label `w'
		gen label1 = "`label1'"
		replace label1 = subinstr(label1, "core","core (100s) ",.)
		local labelnew = label1
		label var `w'h "`labelnew'"
		drop label1
	}
	*I have to create this interactions as variables since i have to standardize them for the post double lasso for the selection table
		gen bfico_post = bfico*post
		gen fs081_400_post = fs081_400*post
		gen fs081_500_post = fs081_500*post
		gen fs081_600_post = fs081_600*post


/*******************************************************************************
*3. Create outcome variables 
********************************************************************************/
*A. Create vars in different units
*B. Index variables
*C. Individual outcomes

**Indices
		gen creditmiss = (recordtype == 14 | recordtype == 15 | recordtype == 16)
		label var creditmiss "Missing credit report"
	*Outcome variable if they are still a member at SLCCU 
		gen slccumem = (!mi(admin_pulldate))
		assert slccumem == 0 if mi(cm_delinquent) & op18 == 1

	**New credit index (inquiry 12, chgacct)
		*if they are missing information because they are no longer a member of slccu then we can infer their num of loans (admin count of loans) is zero 
		*numloans is the number of loans open from the admin data
		replace numloans = 0 if slccumem == 0 
		replace cm_open = 0 if slccumem == 0 

		gen numloansnocbl = numloans - cm_open //neither are ever missing
		replace numloansnocbl = 0 if numloansnocbl <  0 
		label var numloansnocbl "Number of Loans (excluding CBL)"
		sort surveyid index 
		gen chgacct = numaccts - numaccts[_n-1] if surveyid == surveyid[_n-1] 
		
		lab var chgacct "Change in number of accts last 6 mos"
		*changed from cm_open 
		replace cm_delinquent = 0 if mi(cm_delinquent) & op18 == 0 

		gen delqnocml = num_delinquencies - cm_delinquent
		replace delqnocml = 0 if delqnocml < 0
		gen bintotdelq = (delqnocml > 0) if !mi(delqnocml)

	*Payment history index (negnum30acctsdl, negnum90acctsdl, negnumcollect, negtotamtpd, negnumacctsderog)
		*Add to reshape variables
	
	*balance index (agbalrevolvtst agbalinstallnoauto, agbalautoloan)
		gen agbalinstallnoauto = agbalinstalltr - agbalautoloan  
		replace agbalinstallnoauto = 0 if agbalinstallnoauto < 0
		lab var agbalinstallnoauto "Balance of Installment Loans (excluding auto)"

	*utilization index (high_rev_util, openinstalltr, agbalrevolvbin, agbalautobin, agbalinstallnobin)
		gen rev_util = agbalrevolvtrst/aghicredoptrst
	
		gen high_rev_util= (rev_util >= .3) if !mi(rev_util)
		label var high_rev_util "1 = High Revolving Utilization"

		*outcome var that is an indicator of if you have a positive revolving loan bal
		gen agbalrevolvbin = (agbalrevolvtrst > 0) if !mi(agbalrevolvtrst)
		lab var agbalrevolvbin "1 = Has a positive revolving loan balance"

		gen agbalautobin = (agbalautoloan> 0) if !mi(agbalautoloan)
		lab var agbalautobin "1 = Has a positive auto loan balance"
		
		*outcome var that is an indicator of if you have a positive install loan
		gen agbalinstallnobin = (agbalinstallnoauto > 0) if !mi(agbalinstallnoauto)
		lab var agbalinstallnobin "1 = Has a positive installment loan balance (exlcuding auto)" 

	*credit mix index(bothtradelines)
		**Outcome variable dummy for 1 = the person has both and installment tradeline and a revolving tradeline open at baseline
		assert mi(openinstallB) if mi(opentradeB)
		assert mi(opentradeB) if mi(openinstallB)
		*They are always both missing so don't need to worry about obs with just one missing but not the other
		*Want to make sure that if they are missing credit information then they are missing this flag.

		gen bothtradelines = (openinstallB == 1 & opentradeB == 1) if (!mi(openinstallB) | !mi(opentradeB)) 

	*previous loans
          gen prevloans = numaccts-tradelines if !mi(numaccts)
          replace prevloans = 0 if prevloans < 0 
	 
	 *binary = 1 if utilization of revolving loans > 80%, 0 otherwise (and if no revolving loans, then score as 100%), 
              gen usecreditoptrstbr = usecreditoptrst if index == 1
               bysort surveyid: egen usecreditoptrstb = max(usecreditoptrstbr)
               gen opentradeBbr = opentradeB if index == 1
               bysort surveyid: egen opentradeBb = max(opentradeBbr)
               assert opentradeBb == opentradeB1
               replace usecreditoptrstb = 100 if opentradeBb == 0 
               gen high_revolv_util = (usecreditoptrstb > 0.8)  

	*default index (12 vars)"num30acctsdl18" "bin30acctsdl18" "num90acctsdl18" "bin90acctsdl18" "numcollect18" "bincollect18" "numacctsderog18" "binacctsderog18" "totamtpd18t" "bintotamtpd18" "delqnocml18" "bintotdelq18" 
		foreach var in 30acctsdl 90acctsdl collect acctsderog {
			gen bin`var' = (num`var'>0) if !mi(num`var')
		}
		gen bintotamtpd = (totamtpd > 0) if !mi(totamtpd)
		
***Other outcomes 

		replace depbal_me = 0 if mi(depbal_me) & mi(branch) & mi(admin_pulldate)  

	*1 = total deposits > 0 (bindepbal_me)
		gen bindepbal = (depbal_me > 0) if !mi(depbal_me) 

	*number of loans/has a loan thats not a cbl (used as an outcome in table 6)
		gen binnoncblslc = (numloansnocbl > 0) if !mi(numloansnocbl)
		lab var binnoncblslc "1 = Non-cbl loan"

	*Negate ficoscore for ROC curves for ranking interpretability 
		gen negficoscore08 = -ficoscore08 

	tempvar scoredfbaser
	gen `scoredfbaser' = scoredf if index == 1
	bysort surveyid : egen scoredfbase =  max(`scoredfbaser')
	label var scoredfbase "Scored at baseline"

	gen totamtpdt = totamtpd/1000
	label var totamtpdt "Total Amount Placed for Collection (1000s)"

	gen savingsbal_meh =  savingsbal_me/100
	replace savingsbal_meh = 0 if mi(savingsbal_meh) & slccumem == 0 
	replace savingsbal_me = 0 if mi(savingsbal_me) & slccumem == 0
	label var savingsbal_meh "Savings balance (hundreds)"
	gen savcheck_meh = savcheck_me/100
	replace savcheck_meh = 0 if mi(savcheck_meh) & slccumem == 0
	replace savcheck_me = 0 if mi(savcheck_me) & slccumem == 0  
	label var savcheck_meh "Savings and checkings balance (hundreds)"

	ihstrans savingsbal_meh savcheck_meh

	foreach var in savingsbal_meh savcheck_meh slccumem binnoncblslc {
		gen `var'br = `var' if index == 1	
		bysort surveyid : egen `var'b = max(`var'br) 
		if "`var'" == "savingsbal_meh" | "`var'" == "savcheck_meh" {
			winsor `var'b, p(0.05) highonly gen(`var'b95)
			winsor `var', p(0.05) highonly gen(`var'95)
			winsor `var', p(0.01) highonly gen(`var'99)
		}
		
	}

/*******************************************************************************
*4. Create indexes 
********************************************************************************/

local indices $newcredit $amountsowed $utilization $default_index_all $default_index_ser $crosssell $contknow $liqcf 
	foreach var in `:list uniq indices' {
		sum `var' if index == 1 & flag_extragroup == 1			//grab the mean and sd for the control group at baseline
		local mean `r(mean)'
		local sd `r(sd)'
		gen z_`var' = ((`var'- `mean')/`sd') 
	}


foreach index in newcredit amountsowed utilization default_index_all default_index_ser default_index_f4 default_index_l6 crosssell contknow liqcf  {
	foreach svar in $`index' {
		local z_`index' `z_`index'' z_`svar'
	}

		*Create index from standardized variable
		egen `index'_i = rowmean(`z_`index'')
		
		*Standardize the index here  
		sum `index'_i if index == 1 & flag_extragroup == 1
		local mean `r(mean)'
		local sd `r(sd)'
		gen z_`index'_i = ((`index'_i- `mean')/`sd') 

		*Add to reshape variable local so they will get added to the wide data set
}


/*******************************************************************************
4. Create outcomes based on indices
********************************************************************************/	
	
*We want a binary outcome that is 1 if the obs is higher than the median value of the index for ROC curves
	tempvar medianin 
	bysort index: egen `medianin' = median(z_default_index_all_i)
	gen binindex = (default_index_all > `medianin') if (!mi(default_index_all))

*winsorized versions 
	foreach var in  z_crosssell_i {
		winsor `var', p(0.01) highonly gen(`var'99)
		winsor `var', p(0.05) highonly gen(`var'95)
}

	foreach var in z_amountsowed_i z_utilization_i z_default_index_all_i z_newcredit_i z_liquid_i {
		gen `var'br = `var' if index == 1	
		bysort surveyid : egen `var'b = max(`var'br) 
		if "`var'" == "savingsbal_meh" | "`var'" == "savcheck_meh" {
			winsor `var'b, p(0.05) highonly gen(`var'b95)
		}
	}

/*******************************************************************************
*5. Clean up and output long data with outcomes
********************************************************************************/
	*******Label variables  ////note to self-- need to clean these up still
	la var openinstalltr "Number of Installment Loans"
	la var openinstallB "Open Installment Loan"
	la var agbalinstalltr "Balance of Installment Loans"
	la var openrevolvtr "Number of Revolving Loans"
	la var opentradeB "Open Revolving Loan"
	la var agbalrevolvtrst "Balance of Revolving Loans"
	la var bothtradelines "Has both Installment and Revolving Tradelines == 1"
	la var newcred "New Tradeline Since Baseline"
	la var depbal_me "Total Deposits Balance"
	la var bindepbal "Total Deposits Balance > 0 = 1"
	la var op18 "Opened a CBL"
	la var bintotamtpd "1= Has amount placed for collection"
	la var bin30acctsdl "1= Has acct 30+ dpd"
	la var bin90acctsdl "1= Has acct 90+ dpd"
	la var bincollect "1 = Has acct in collection"
	la var binacctsderog "1= Has accts with derog event"
	la var binindex "1 = Higher than median on delinquency index"
	la var age "Age"
	la var female "Female"
	la var married "Married"
	la var adults "# Adults in HH"
	la var child "# Children in HH"
	la var race_black "Race - Black"
	la var slccumem "1= Still SLCCU member"
	la var z_newcredit_i "New credit index (standardized)"

	la var z_amountsowed_i "Amounts Owed: Balances Index (standardized)"
	la var z_utilization_i "Amounts Owed: Utilization index (standardized)"
	la var z_default_index_all_i "Default index (standardized)"
	la var z_default_index_ser_i "Serious default index (standardized)"
	la var z_crosssell_i "Cross sell index (standardized)"
	label var ficoscore08h "FICO Score (100s)"
	la var negficoscore08 "Negative ficoscore for ROCs"
	la var bintotdelq "1 = Delq no cml"
	la var binnoncblslcb "1 = Non-cbl loan (at baseline)"
	la var slccumemb "1= Still SLCCU member (at baseline)"
	la var z_default_index_all_ib "Default index (standardized, at baseline)"
	la var z_newcredit_ib "New credit index (standardized, at baseline)"
	la var z_liquid_ib "Liquidity index (standardized, at baseline)"
	la var z_amountsowed_ib "Amounts Owed: Balances Index (standardized, at baseline)"
	la var savingsbal_mehb "Savings balance (hundreds, at baseline)"
	la var savcheck_mehb "Savings and checkings balance (hundreds, at baseline)"
	la var z_credac_rev_i "Revolving credit activity at baseline index"
	la var prevloans "Number of prior loans, lifetime"
	

		*Define value labels
		label define fico ///
			1 "Amount owed on accounts is too high" ///
			2 "Level of delinquency on accounts" ///
			3 "Too few bank/national revolving accounts" ///
			4 "Too many bank/national revolving accounts" ///
			5 "Too many accounts with balances" ///
			6 "Too many consumer finance company accounts" ///
			7 "Account payment history is too new to rate" ///
			8 "Too many inquiries last 12 months" ///
			9 "Too many account recently opened" ///
			10 "Ratio of balance to limit on bank revolving or other rev accts is too high" ///
			11 "Amount owed on revolving accounts is too high" ///
			12 "Length of time revolving accounts have been established" ///
			13 "Time since deliquency is too recent or unknown" ///
			14 "Length of time accounts have been established" ///
			15 "Lack of recent bank/national revolving information" ///
			16 "Lack of recent revolving account information" ///
			17 "No recent non-mortgage balance information" ///
			18 "Number of accounts with delinquency" ///
			19 "Too few accounts currently paid as agreed" ///
			20 "Time since derogatory public record or collection is too short" ///
			21 "Amount past due on accounts" ///
			24 "No recent revolving balances" ///
			25 "Length of time installment loans have been established" ///
			26 "Number of revolving accounts" ///
			28 "Number of established accounts" ///
			30 "Time since most recent account opening is too short" ///
			31 "Too few accounts with recent payment information" ///
			32 "Lack of recent installment loan information" ///
			33 "Proportion of loan balances to loan amounts is too high", modify
		
		label define fico ///
			38 "Serious delq, & public record or collection filed" ///
			39 "Serious delinquency" ///
			40 "Derogatory public record or collection filed", add 

		*relabel FICO score reasons values
		forval x = 1/5 {
			label values ficors`x' fico
		}

		label var delqnocml "Number of Delinquencies (minus cbl delq)"

 
	local reshapevars encpd ficoscore08 $revvars $insecurity $selfcont $liquid  ///
	$risk $credstatus $credknow $indices_s $contknow $liqcf $crosssell $creditmix ///
	$utilization $amountsowed $default_index_all $newcredit scoredf tradelines race_white ///
	yearsslccu savingsbal_me numloans hasloan_nocm $baselinechars $indices_ac ///
	savingsbal_me savingsbal_meh savingsbal_mehb savingsbal_mehb95  savingsbal_meh95 ///
	savingsbal_meh99 savcheck_me savcheck_meh savcheck_mehb savcheck_mehb95  ///
	savcheck_meh95 savcheck_meh99 ficoscore08h totamtpdt negficoscore08 binindex ///
	binnoncblslc prevloans z_default_index_f4 z_default_index_l6
	
	local reshapevars : list uniq reshapevars
	cap drop __00000*

	save "$adta/Long with outcomes.dta", replace 

/*******************************************************************************
*6. Reshape and merge outcome vars to wide data set  
********************************************************************************/
	
	*Keep the outcome variables that we want to reshape as well as the identifying variables needed for the reshape command
	keep surveyid index op18 open_sd open_sd30d open_sd180d open_sd365d open_sdsame `reshapevars' $htegroups

	*Reshape drops value and variable labels so we need to save these before reshaping so they can be re-labeled easily
	unab allvars: `reshapevars'
	foreach var in `allvars' {
		* pull variable label
		loc l`var' : var lab `var'
		* pull value label, if exists
		loc lab_`var': value label `var'
	}	

	*Reshape the outcome variabels that I created to be wide so we can merge to the wide data set  open_sd30d open_sd180d
	reshape wide `reshapevars', i(surveyid htca ltca op18 open_sd open_sd30d open_sd180d open_sd365d) j(index)

	* Re-label variables and values
	foreach i in 1 6 12 18 {
		foreach var in `allvars' {
			if `i' != 1 {
				label var `var'`i' "`l`var'', `i' Months"
				label val `var'`i' `lab_`var''
			}
			else if `i' == 1 {
				label var `var'`i' "`l`var'', Baseline"
				label val `var'`i' `lab_`var''
			} 
		}
	}

	**Merge the reshaped outcome vars to the wide data set
	merge 1:1 surveyid using "$adta/Sample and Baseline Vars.dta", ///
	assert(3) keepusing(flag_randomized  $groupflags $baselineoutcomes enc svymiss inclt30 incgt30 ne_ln)

	*Save the wide data set with the outcome variables
	save "$adta/Wide with outcomes.dta", replace

**EOF**
